/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
18 package org.apache.solr.handler.component;
19
20 import java.io.IOException;
21 import java.util.ArrayList;
22 import java.util.Arrays;
23 import java.util.Collection;
24 import java.util.Collections;
25 import java.util.EnumSet;
26 import java.util.IdentityHashMap;
27 import java.util.Iterator;
28 import java.util.List;
29 import java.util.Map;
30 import java.util.Set;
31
32 import org.apache.commons.lang.StringUtils;
33 import org.apache.lucene.document.FieldType.NumericType;
34 import org.apache.lucene.index.LeafReaderContext;
35 import org.apache.lucene.queries.function.FunctionQuery;
36 import org.apache.lucene.queries.function.ValueSource;
37 import org.apache.lucene.queries.function.valuesource.FieldCacheSource;
38 import org.apache.lucene.queries.function.valuesource.QueryValueSource;
39 import org.apache.lucene.search.Query;
40 import org.apache.solr.common.SolrException;
41 import org.apache.solr.common.SolrException.ErrorCode;
42 import org.apache.solr.common.params.CommonParams;
43 import org.apache.solr.common.params.ModifiableSolrParams;
44 import org.apache.solr.common.params.SolrParams;
45 import org.apache.solr.common.params.StatsParams;
46 import org.apache.solr.common.util.StrUtils;
47 import org.apache.solr.request.DocValuesStats;
48 import org.apache.solr.request.SolrQueryRequest;
49 import org.apache.solr.schema.IndexSchema;
50 import org.apache.solr.schema.SchemaField;
51 import org.apache.solr.search.DocIterator;
52 import org.apache.solr.search.DocSet;
53 import org.apache.solr.search.QParser;
54 import org.apache.solr.search.QParserPlugin;
55 import org.apache.solr.search.QueryParsing;
56 import org.apache.solr.search.SolrIndexSearcher;
57 import org.apache.solr.search.SyntaxError;
58 import org.apache.solr.util.hll.HLL;
59 import org.apache.solr.util.hll.HLLType;
60
61 import com.google.common.hash.Hashing;
62 import com.google.common.hash.HashFunction;
63
64
65
66
67
68
69
/**
 * Models all of the information associated with a single {@link StatsParams#STATS_FIELD}
 * instance -- either a simple field name, or a query/function over whose values stats
 * should be computed -- along with the set of statistics requested for it.
 *
 * <p>NOTE(review): comments below are reconstructed from the visible code; the original
 * javadoc was stripped from this copy of the file.</p>
 */
public class StatsField {

  /**
   * An enumeration of the sub-statistics supported by this component.  Any value of this
   * enum may be specified as a local param name on the stats.field (with a
   * <code>true</code> value, or stat-specific options) to request that stat in the
   * response.  Each stat also declares which stats a shard must compute so that this
   * stat can be merged/computed distributively (see {@link #getDistribDeps}).
   */
  public static enum Stat {
    min(true),
    max(true),
    missing(true),
    sum(true),
    count(true),
    // mean is derived from sum & count, so shards only need to send those two
    mean(false, sum, count),
    sumOfSquares(true),
    // stddev is derived from sum, count & sumOfSquares
    stddev(false, sum, count, sumOfSquares),
    distinctValues(true),
    // countDistinct is derived from the full distinctValues set
    countDistinct(false, distinctValues),
    percentiles(true){
      /**
       * Special parsing for percentiles: the local param value is a comma separated
       * list of percentile cut-offs (doubles); an optional
       * <code>tdigestCompression</code> local param tunes the t-digest accuracy.
       * Returns true only if at least one percentile was requested.
       */
      boolean parseParams(StatsField sf) {
        String percentileParas = sf.localParams.get(this.name());
        if (percentileParas != null) {
          List<Double> percentiles = new ArrayList<Double>();
          try {
            for (String percentile : StrUtils.splitSmart(percentileParas, ',')) {
              percentiles.add(Double.parseDouble(percentile));
            }
            if (!percentiles.isEmpty()) {
              sf.percentilesList.addAll(percentiles);
              sf.tdigestCompression = sf.localParams.getDouble("tdigestCompression",
                                                               sf.tdigestCompression);
              return true;
            }
          } catch (NumberFormatException e) {
            throw new SolrException(ErrorCode.BAD_REQUEST, "Unable to parse "
                + StatsParams.STATS_FIELD + " local params: " + sf.localParams + " due to: "
                + e.getMessage(), e);
          }

        }
        return false;
      }
    },
    cardinality(true) {
      /**
       * Special parsing for cardinality: the local param value may be "true" (default
       * tuning) or a number between 0 and 1 trading memory for accuracy; see
       * {@link HllOptions#parseHllOptions}.  Returns true only if HLL options were
       * successfully parsed (i.e. the stat was actually requested).
       */
      boolean parseParams(StatsField sf) {
        try {
          sf.hllOpts = HllOptions.parseHllOptions(sf.localParams, sf.schemaField);
          return (null != sf.hllOpts);
        } catch (Exception e) {
          throw new SolrException(ErrorCode.BAD_REQUEST, "Unable to parse "
              + StatsParams.STATS_FIELD + " local params: " + sf.localParams + " due to: "
              + e.getMessage(), e);
        }
      }
    };

    /**
     * The stats that must be computed (on each shard) in order for this stat to be
     * computable in a distributed request -- may include this stat itself.
     */
    private final List<Stat> distribDeps;

    /**
     * Sole constructor for Stat enum values.
     *
     * @param selfDep indicates that when computing this stat across a distributed
     *        request, each shard must compute this stat itself (in addition to any
     *        other dependencies).
     * @param deps other stats that each shard must compute so this stat can be
     *        derived/merged; must be declared earlier in the enum so they can be
     *        referenced here.
     */
    Stat(boolean selfDep, Stat... deps) {
      distribDeps = new ArrayList<Stat>(deps.length+1);
      distribDeps.addAll(Arrays.asList(deps));
      if (selfDep) {
        distribDeps.add(this);
      }
    }

    /**
     * Case-sensitive lookup of a Stat by its local-param name; returns null (rather
     * than throwing) if the key does not name a known stat.
     */
    public static Stat forName(String paramKey) {
      try {
        return Stat.valueOf(paramKey);
      } catch (IllegalArgumentException e) {
        return null;
      }
    }

    /**
     * The stats that must be computed and returned by each shard involved in a
     * distributed request in order to compute the overall value for this stat across
     * the entire distributed result set.  Returned as a defensive copy.
     */
    public EnumSet<Stat> getDistribDeps() {
      return EnumSet.copyOf(this.distribDeps);
    }

    /**
     * Called when the name of this stat is found as a local param on this
     * {@link StatsField}.  Default behavior: treat the param value as a boolean.
     *
     * @return true if the user is requesting this stat, else false
     */
    boolean parseParams(StatsField sf) {
      return sf.localParams.getBool(this.name(), false);
    }

  }

  /**
   * The set of stats both implied by the legacy "calcdistinct" pseudo-stat param
   * (it is not a real {@link Stat}, but turns on both of these).
   */
  private static final EnumSet<Stat> CALCDISTINCT_PSUEDO_STAT = EnumSet.of(Stat.countDistinct, Stat.distinctValues);

  /**
   * The stats computed by default when no specific stats are requested via
   * local params.
   */
  public final static Set<Stat> DEFAULT_STATS = Collections.<Stat>unmodifiableSet
    (EnumSet.of(Stat.min, Stat.max, Stat.missing, Stat.sum, Stat.count, Stat.mean, Stat.sumOfSquares, Stat.stddev));

  private final SolrIndexSearcher searcher;
  private final ResponseBuilder rb;
  // the raw stats.field param string this instance was built from
  private final String originalParam;
  private final SolrParams localParams;
  // exactly one of valueSource / schemaField is non-null (see constructor asserts)
  private final ValueSource valueSource;
  private final SchemaField schemaField;
  // output key used in the response (may be overridden via local params)
  private final String key;
  private final boolean topLevelCalcDistinct;
  private final String[] facets;
  private final List<String> tagList;
  private final List<String> excludeTagList;
  // superset of statsInResponse: everything that must actually be computed locally
  private final EnumSet<Stat> statsToCalculate = EnumSet.noneOf(Stat.class);
  private final EnumSet<Stat> statsInResponse = EnumSet.noneOf(Stat.class);
  // populated by Stat.percentiles.parseParams when percentiles are requested
  private final List<Double> percentilesList= new ArrayList<Double>();
  private final boolean isShard;

  // t-digest compression factor for percentile computation (higher = more accurate)
  private double tdigestCompression = 100.0D;
  // non-null only when the cardinality stat was requested and parsed successfully
  private HllOptions hllOpts;

  /**
   * @param rb the current request/response
   * @param statsParam the raw {@link StatsParams#STATS_FIELD} string, possibly
   *        including local params
   */
  public StatsField(ResponseBuilder rb, String statsParam) {
    this.rb = rb;
    this.searcher = rb.req.getSearcher();
    this.originalParam = statsParam;

    SolrParams params = rb.req.getParams();
    try {
      isShard = params.getBool("isShard", false);
      SolrParams localParams = QueryParsing.getLocalParams(originalParam, params);
      if (null == localParams) {
        // simplest possible input: bare field name with no local params --
        // normalize into a localParams map so the rest of the code has one code path
        ModifiableSolrParams customParams = new ModifiableSolrParams();
        customParams.add(QueryParsing.V, originalParam);
        localParams = customParams;
      }

      this.localParams = localParams;

      String parserName = localParams.get(QueryParsing.TYPE);
      SchemaField sf = null;
      ValueSource vs = null;

      if ( StringUtils.isBlank(parserName) ) {

        // basic request for field stats: the "v" local param is a schema field name
        sf = searcher.getSchema().getField(localParams.get(QueryParsing.V));

      } else {

        // non-trivial request: compute stats over a query (or function);
        // look up the named QParserPlugin and parse the "v" value with it
        QParserPlugin qplug = rb.req.getCore().getQueryPlugin(parserName);
        QParser qp = qplug.createParser(localParams.get(QueryParsing.V),
                                        localParams, params, rb.req);

        // figure out what type of query we are dealing with, get the most direct ValueSource
        vs = extractValueSource(qp.parse());

        // if this ValueSource directly corresponds to a SchemaField, act as if we were
        // asked to compute stats on that field directly
        // e.g.: "stats.field={!func}field(foo)" behaves like "stats.field=foo"
        sf = extractSchemaField(vs, searcher.getSchema());
        if (null != sf) {
          vs = null;
        }
      }

      assert ( (null == vs) ^ (null == sf) ) : "exactly one of vs & sf must be null";

      this.schemaField = sf;
      this.valueSource = vs;

    } catch (SyntaxError e) {
      throw new SolrException(ErrorCode.BAD_REQUEST, "Unable to parse " +
          StatsParams.STATS_FIELD + ": " + originalParam + " due to: "
          + e.getMessage(), e);
    }

    // output key precedence: "key" local param, then "v" local param, then raw input
    this.key = localParams.get(CommonParams.OUTPUT_KEY,
                               localParams.get(CommonParams.VALUE,
                                               originalParam));

    // top-level (non-local) calcdistinct override; per-field variant wins when we
    // have a concrete schema field
    this.topLevelCalcDistinct = null == schemaField
      ? params.getBool(StatsParams.STATS_CALC_DISTINCT, false)
      : params.getFieldBool(schemaField.getName(), StatsParams.STATS_CALC_DISTINCT, false);

    populateStatsSets();

    String[] facets = params.getFieldParams(key, StatsParams.STATS_FACET);
    this.facets = (null == facets) ? new String[0] : facets;
    String tagStr = localParams.get(CommonParams.TAG);
    this.tagList = (null == tagStr)
        ? Collections.<String>emptyList()
        : StrUtils.splitSmart(tagStr,',');

    // "ex" local param: filter-query tags whose queries should be excluded from
    // the base doc set this stats field is computed against
    String excludeStr = localParams.get(CommonParams.EXCLUDE);
    this.excludeTagList = (null == excludeStr)
        ? Collections.<String>emptyList()
        : StrUtils.splitSmart(excludeStr,',');

    assert ( (null == this.valueSource) ^ (null == this.schemaField) )
      : "exactly one of valueSource & schemaField must be null";
  }

  /**
   * Returns a {@link ValueSource} that corresponds to the specified Query:
   * if it is a {@link FunctionQuery}, its underlying ValueSource is unwrapped and
   * returned directly; any other Query is wrapped in a {@link QueryValueSource}
   * (with a default value of 0.0 for non-matching docs).
   */
  private static ValueSource extractValueSource(Query q) {
    return (q instanceof FunctionQuery) ?
        ((FunctionQuery) q).getValueSource() :
        new QueryValueSource(q, 0.0F);
  }

  /**
   * Returns the {@link SchemaField} the specified ValueSource is directly based on
   * ({@link FieldCacheSource}), or null if the ValueSource is more complex and cannot
   * be reduced to a single field.
   *
   * @param vs a ValueSource to inspect
   * @param schema the schema to look up the field in
   */
  private static SchemaField extractSchemaField(ValueSource vs, IndexSchema schema) {
    if (vs instanceof FieldCacheSource) {
      String fieldName = ((FieldCacheSource)vs).getField();
      return schema.getField(fieldName);
    }
    return null;
  }

  /**
   * The key to be used when refering to this {@link StatsField} instance in the
   * response tp clients.
   */
  public String getOutputKey() {
    return key;
  }

  /**
   * Computes a base {@link DocSet} for the current request to be used when computing
   * global stats for the local index.  This is typically the same as the main
   * result-set DocSet, but if any "ex" tags were specified, the corresponding tagged
   * filter queries are excluded and the DocSet is recomputed from the remaining
   * query + filters.
   *
   * @see CommonParams#EXCLUDE
   */
  public DocSet computeBaseDocSet() throws IOException {

    DocSet docs = rb.getResults().docSet;
    // tag -> Collection<QParser> map populated elsewhere in the request context;
    // NOTE(review): assumes the "tags" context entry follows that shape -- verify
    // against the component that populates it
    Map<?,?> tagMap = (Map<?,?>) rb.req.getContext().get("tags");

    if (excludeTagList.isEmpty() || null == tagMap) {
      // either the exclusion set is empty, or there are no tagged filters to exclude
      return docs;
    }

    // identify the queries that should be excluded (identity semantics: we want to
    // drop the exact Query instances that were tagged, hence IdentityHashMap)
    IdentityHashMap<Query,Boolean> excludeSet = new IdentityHashMap<Query,Boolean>();
    for (String excludeTag : excludeTagList) {
      Object olst = tagMap.get(excludeTag);
      // tagMap has entries of List<String,List<QParser>>, but subject to change in the future
      if (!(olst instanceof Collection)) continue;
      for (Object o : (Collection<?>)olst) {
        if (!(o instanceof QParser)) continue;
        QParser qp = (QParser)o;
        try {
          excludeSet.put(qp.getQuery(), Boolean.TRUE);
        } catch (SyntaxError e) {
          // this shouldn't be possible since the request should have already
          // failed when attempting to execute the query, but just in case...
          throw new SolrException(ErrorCode.BAD_REQUEST, "Excluded query can't be parsed: " +
              originalParam + " due to: " + e.getMessage(), e);
        }
      }
    }
    if (excludeSet.size() == 0) return docs;

    // rebuild the query list (main query + filters) minus the excluded queries
    List<Query> qlist = new ArrayList<Query>();

    // add the base query
    if (!excludeSet.containsKey(rb.getQuery())) {
      qlist.add(rb.getQuery());
    }

    // add the filters
    if (rb.getFilters() != null) {
      for (Query q : rb.getFilters()) {
        if (!excludeSet.containsKey(q)) {
          qlist.add(q);
        }
      }
    }

    // get the new base docset for this facet
    return searcher.getDocSet(qlist);
  }

  /**
   * Computes the {@link StatsValues} for this {@link StatsField} relative to the
   * specified {@link DocSet}.
   *
   * @see #computeBaseDocSet
   */
  public StatsValues computeLocalStatsValues(DocSet base) throws IOException {

    if (statsToCalculate.isEmpty()) {
      // perf optimization for the case where we compute nothing
      // ie: stats.field={!min=false}field_name
      return StatsValuesFactory.createStatsValues(this);
    }

    if (null != schemaField
        && (schemaField.multiValued() || schemaField.getType().multiValuedFieldCache())) {

      // multi-valued fields go through the DocValues/UnInvertedField based code path
      return DocValuesStats.getCounts(searcher, this, base, facets);
    } else {
      // either a single-valued field, or a function/query ValueSource
      return computeLocalValueSourceStats(base);
    }
  }

  /**
   * Walks every doc in the base DocSet segment-by-segment, accumulating stats (and
   * per-facet stats) via the ValueSource machinery.  Relies on DocIterator returning
   * docs in increasing global id order so each segment's reader context is only
   * entered once.
   */
  private StatsValues computeLocalValueSourceStats(DocSet base) throws IOException {

    IndexSchema schema = searcher.getSchema();

    final StatsValues allstats = StatsValuesFactory.createStatsValues(this);

    List<FieldFacetStats> facetStats = new ArrayList<>();
    for( String facetField : facets ) {
      SchemaField fsf = schema.getField(facetField);

      if ( fsf.multiValued()) {
        throw new SolrException(SolrException.ErrorCode.BAD_REQUEST,
          "Stats can only facet on single-valued fields, not: " + facetField );
      }

      facetStats.add(new FieldFacetStats(searcher, fsf, this));
    }

    final Iterator<LeafReaderContext> ctxIt = searcher.getIndexReader().leaves().iterator();
    LeafReaderContext ctx = null;
    for (DocIterator docsIt = base.iterator(); docsIt.hasNext(); ) {
      final int doc = docsIt.nextDoc();
      if (ctx == null || doc >= ctx.docBase + ctx.reader().maxDoc()) {
        // advance to the segment containing this doc (segments may be skipped
        // entirely if the DocSet has no docs in them)
        do {
          ctx = ctxIt.next();
        } while (ctx == null || doc >= ctx.docBase + ctx.reader().maxDoc());
        assert doc >= ctx.docBase;

        // propagate the new segment context to the accumulators
        allstats.setNextReader(ctx);
        for (FieldFacetStats f : facetStats) {
          f.setNextReader(ctx);
        }
      }

      // accumulate using segment-local doc ids
      allstats.accumulate(doc - ctx.docBase);
      for (FieldFacetStats f : facetStats) {
        f.facet(doc - ctx.docBase);
      }
    }

    for (FieldFacetStats f : facetStats) {
      allstats.addFacet(f.name, f.facetStatsValues);
    }
    return allstats;
  }

  /**
   * The searcher that should be used for processing local stats
   * @see SolrQueryRequest#getSearcher
   */
  public SolrIndexSearcher getSearcher() {
    // see AbstractStatsValues.setNextReader
    return searcher;
  }

  /**
   * The {@link SchemaField} whose results these stats are computed over, may be null
   * if the stats are computed over the results of a function or query
   *
   * @see #getValueSource
   */
  public SchemaField getSchemaField() {
    return schemaField;
  }

  /**
   * The {@link ValueSource} of a function or query whose results these stats are
   * computed over, may be null if the stats are computed over a {@link SchemaField}
   *
   * @see #getSchemaField
   */
  public ValueSource getValueSource() {
    return valueSource;
  }

  /** Tags ("tag" local param) associated with this stats field, possibly empty. */
  public List<String> getTagList() {
    return tagList;
  }

  public String toString() {
    return "StatsField<" + originalParam + ">";
  }

  /**
   * A helper method which inspects the local params on this stats.field and
   * populates the {@link #statsInResponse} and {@link #statsToCalculate} sets:
   * stats explicitly requested via local params (or the DEFAULT_STATS fallback,
   * or the legacy calcdistinct pseudo-stat) plus all of their distributed
   * dependencies.
   */
  private void populateStatsSets() {
    boolean statSpecifiedByLocalParam = false;
    // local individual stat
    Iterator<String> itParams = localParams.getParameterNamesIterator();

    while (itParams.hasNext()) {
      String paramKey = itParams.next();
      Stat stat = Stat.forName(paramKey);
      if (stat != null) {
        statSpecifiedByLocalParam = true;
        if (stat.parseParams(this)) {
          statsInResponse.add(stat);
        }
      }
    }

    // if no individual stat and no calcdistinct was requested,
    // fall back to the default set
    if ( ! ( statSpecifiedByLocalParam
             // calcdistinct (as a local param) is a psuedo-stat, prevents default
             || localParams.getBool("calcdistinct", false) ) ) {
      statsInResponse.addAll(DEFAULT_STATS);
    }

    // legacy behavior: calcdistinct implies countDistinct & distinctValues
    // (unless either is explicitly disabled as a local param)
    if (localParams.getBool("calcdistinct", topLevelCalcDistinct)) {
      for (Stat stat : CALCDISTINCT_PSUEDO_STAT) {
        // assume true, but don't include if specific stat overrides
        if (localParams.getBool(stat.name(), true)) {
          statsInResponse.add(stat);
        }
      }
    }

    for (Stat stat : statsInResponse) {
      statsToCalculate.addAll(stat.getDistribDeps());
    }
  }

  /** Whether the specified stat must be computed locally (includes distrib deps). */
  public boolean calculateStats(Stat stat) {
    return statsToCalculate.contains(stat);
  }

  /**
   * Whether the specified stat should be included in the response: on a shard,
   * every computed stat is returned (so the coordinator can merge/derive); on the
   * coordinator, only explicitly requested stats are returned.
   */
  public boolean includeInResponse(Stat stat) {
    if (isShard) {
      return statsToCalculate.contains(stat);
    }

    if (statsInResponse.contains(stat)) {
      return true;
    }
    return false;
  }

  /** Requested percentile cut-offs; empty unless the percentiles stat was requested. */
  public List<Double> getPercentilesList() {
    return percentilesList;
  }

  /** True if this request is a per-shard sub-request of a distributed request. */
  public boolean getIsShard() {
    return isShard;
  }

  /** The t-digest compression factor used for percentile computation. */
  public double getTdigestCompression() {
    return tdigestCompression;
  }

  /** HLL tuning options, or null if the cardinality stat was not requested. */
  public HllOptions getHllOptions() {
    return hllOpts;
  }

  /**
   * Helper Struct for parsing and encapsulating all of the options relaed to
   * building a {@link HLL} for computing the cardinality stat.
   *
   * @see Stat#cardinality
   */
  public static final class HllOptions {
    // null indicates that values are "pre-hashed" longs and should be fed to the
    // HLL directly (only supported for Long based fields, see parseHllOptions)
    final HashFunction hasher;

    // number of registers is 2^log2m; drives both accuracy and memory usage
    final int log2m;
    // bits per register
    final int regwidth;

    final static String ERR = "cardinality must be specified as 'true' (for default tunning) or decimal number between 0 and 1 to adjust accuracy vs memory usage (large number is more memory and more accuracy)";

    private HllOptions(int log2m, int regwidth, HashFunction hasher) {
      this.log2m = log2m;
      this.regwidth = regwidth;
      this.hasher = hasher;
    }

    /**
     * Parses the "cardinality" local param on a stats.field and returns the
     * appropriate {@link HllOptions}, or null if the cardinality stat was not
     * requested.  The param value may be "true" (default tuning) or a double in
     * [0,1] that linearly scales log2m/regwidth between their min and max values
     * (trading memory for accuracy); "hllLog2m", "hllRegwidth" and "hllPreHashed"
     * local params provide explicit overrides.
     *
     * @param localParams the local params for this stats.field
     * @param field the schema field these stats are over, may be null for
     *        function/query stats
     * @throws SolrException on invalid option values
     */
    public static HllOptions parseHllOptions(SolrParams localParams, SchemaField field)
      throws SolrException {

      String cardinalityOpt = localParams.get(Stat.cardinality.name());
      if (StringUtils.isBlank(cardinalityOpt)) {
        return null;
      }

      final NumericType hashableNumType = getHashableNumericType(field);

      // default tuning values
      int log2m = 13;   // roughly equivilent to "cardinality='0.33'"
      int regwidth = 6; // with decent hash, this is plenty for all valid long hashes

      if (NumericType.FLOAT.equals(hashableNumType) || NumericType.INT.equals(hashableNumType)) {
        // for 32bit values, the default regwidth can be notched down a bit since
        // the hash space is smaller
        regwidth--;
      }

      // attempt to parse the param as a double meaning "tune accuracy/memory";
      // if it's not a number, fall through and treat it as a boolean
      try {
        // NFE will short out here if it's not a number
        final double accuracyOpt = Double.parseDouble(cardinalityOpt);

        // if a float between 0 and 1 is specified, treat it as a prefrence of
        // accuracy vs memory, and use it to scale log2m & regwidth within their
        // valid ranges
        if (accuracyOpt < 0D || 1.0D < accuracyOpt) {
          throw new SolrException(ErrorCode.BAD_REQUEST, ERR);
        }

        // scale log2m linearly between its min & max
        log2m = HLL.MINIMUM_LOG2M_PARAM
          + (int) Math.round(accuracyOpt * (HLL.MAXIMUM_LOG2M_PARAM - HLL.MINIMUM_LOG2M_PARAM));

        // scale regwidth linearly between a heuristic floor (one below the type
        // based default computed above) and the max
        final int MIN_HUERISTIC_REGWIDTH = regwidth-1;
        regwidth = MIN_HUERISTIC_REGWIDTH
          + (int) Math.round(accuracyOpt * (HLL.MAXIMUM_REGWIDTH_PARAM - MIN_HUERISTIC_REGWIDTH));

      } catch (NumberFormatException nfe) {
        // param value isn't a number -- let's check for simple true/false
        if (! localParams.getBool(Stat.cardinality.name(), false)) {
          return null;
        }
      }

      // let explicit params override the defaults/accuracy-derived values
      log2m = localParams.getInt("hllLog2m", log2m);
      regwidth = localParams.getInt("hllRegwidth", regwidth);

      // validate legal values
      if (log2m < HLL.MINIMUM_LOG2M_PARAM || HLL.MAXIMUM_LOG2M_PARAM < log2m) {
        throw new SolrException(ErrorCode.BAD_REQUEST, "hllLog2m must be at least " +
            HLL.MINIMUM_LOG2M_PARAM + " and at most " + HLL.MAXIMUM_LOG2M_PARAM
            + " (" + log2m +")");
      }
      if (regwidth < HLL.MINIMUM_REGWIDTH_PARAM || HLL.MAXIMUM_REGWIDTH_PARAM < regwidth) {
        throw new SolrException(ErrorCode.BAD_REQUEST, "hllRegwidth must be at least " +
            HLL.MINIMUM_REGWIDTH_PARAM + " and at most " + HLL.MAXIMUM_REGWIDTH_PARAM);
      }

      HashFunction hasher = localParams.getBool("hllPreHashed", false) ? null : Hashing.murmur3_128();

      if (null == hasher) {
        // if the user is feeding us pre-hashed values, the only way they could
        // have valid full-width hashes is if the field is Long based
        if (null == field || !NumericType.LONG.equals(field.getType().getNumericType())) {
          throw new SolrException(ErrorCode.BAD_REQUEST, "hllPreHashed is only supported with Long based fields");
        }
      }

      // if we shift our accuracy parsing, this is where we'd also parse
      // explicit hllType options
      return new HllOptions(log2m, regwidth, hasher);
    }

    /** @see HLL */
    public int getLog2m() {
      return log2m;
    }

    /** @see HLL */
    public int getRegwidth() {
      return regwidth;
    }

    /** May be null if user has indicated that field values are pre-hashed */
    public HashFunction getHasher() {
      return hasher;
    }

    /** Constructs a new (empty) HLL instance using these options. */
    public HLL newHLL() {
      // Although it (in theory) saves memory for "medium" size sets, the
      // SPARSE type seems to have extremely nasty impacts on response time
      // for wide ranges of set sizes, so we only use the FULL (and tiny
      // EXPLICIT) representations here
      return new HLL(getLog2m(), getRegwidth(), -1 /* auto explict threshold */,
                     false /* no sparse representation */, HLLType.EMPTY);

    }
  }

  /**
   * Returns the effective {@link NumericType} for the purposes of hashing values
   * for the cardinality stat.  Text-ish fields (null field, or a field with no
   * numeric type) are hashed via their string representation:
   * a null field (function/query stats) is treated as FLOAT (the ValueSource
   * result type assumed here -- NOTE(review): verify against StatsValuesFactory),
   * and a non-numeric field falls back to LONG.
   */
  private static NumericType getHashableNumericType(SchemaField field) {
    if (null == field) {
      return NumericType.FLOAT;
    }
    final NumericType result = field.getType().getNumericType();
    return null == result ? NumericType.LONG : result;
  }
}